library(ggplot2)
library(dplyr)
library(tidyverse)



# Load data
# MLST table: tab-separated, no header row; columns are named ID and mlst here.
mlst_file <- read.table(
  file = "mlst_ab_freq_wored100.tsv",
  header = FALSE,
  sep = "\t",
  col.names = c("ID", "mlst")
)

# Metadata file extracted from Suppl. Table S1.
# quote = "" disables quoting altogether, so commas, apostrophes and double
# quotes inside metadata fields are read literally (quote = "," would turn
# every comma into a quote character and garble fields that contain one).
subregion_file <- read.table(
  file = "metadata_ab_subregions.tsv",
  header = TRUE,
  sep = "\t",
  quote = ""
)

# Join both data frames on ID; merge() keeps only IDs present in both tables.
mlst_md <- merge(mlst_file, subregion_file, by = "ID")

# Count genomes per ST and isolation type, then express each count as a
# percentage of all genomes of that ST.
count <- mlst_md %>%
  # Label blank isolation types before counting so they end up in a single
  # "Unknown" category; only this column is touched and NA values are kept.
  mutate(
    Isolation.type = replace(Isolation.type, Isolation.type %in% "", "Unknown")
  ) %>%
  # dplyr:: prefix because this script also uses `count` as a variable name.
  dplyr::count(mlst, Isolation.type) %>%
  # Percentages are computed within each ST, so they sum to ~100 per bar.
  group_by(mlst) %>%
  mutate(per = round(n / sum(n) * 100, 2)) %>%
  ungroup()

# Draw plot: one stacked bar per ST, split by isolation type.

# STs appear on the x axis in this fixed order; an ST missing from this vector
# becomes NA when the factor is built.
st_levels <- c("ST2", "ST79", "ST1", "ST3", "ST499", "ST10", "ST78", "ST25")

# Unnamed colour vector, so ggplot2 assigns the colours to the fill categories
# in scale order.
fill_palette <- c(
  "#104E8B", "steelblue2", "#98F5FF", "aquamarine3", "firebrick1",
  "#FF6A6A", "#FF3E96", "#FFA07A", "darkgoldenrod1", "#CDAD00",
  "#68228B", "#AB82FF", "#66CD00", "gray90"
)

plot <- ggplot(
  count,
  aes(x = factor(mlst, levels = st_levels), y = per, fill = Isolation.type)
) +
  geom_col() +
  # Each segment is labelled with its raw genome count, centred in the segment.
  geom_text(aes(label = n), size = 4, position = position_stack(vjust = 0.5)) +
  theme_minimal() +
  # NOTE(review): the legend is titled "Subregions" while the fill column is
  # Isolation.type - confirm this is the intended column/title.
  labs(x = "", y = "Percent of genomes", fill = "Subregions") +
  scale_fill_manual(values = fill_palette, na.value = "gray90") +
  # Upper limit sits just above 100; presumably so bars whose rounded
  # percentages sum to slightly more than 100 are not dropped - TODO confirm.
  ylim(0, 100.1) +
  theme(
    axis.text.x = element_text(size = 13),
    axis.text.y = element_text(size = 11)
  )

# Write the figure to an 8 x 10 inch PDF.
pdf("figS2.pdf", width = 8, height = 10, paper = "special")
# Print explicitly: a bare `plot` is only drawn through top-level auto-printing,
# so the PDF would come out empty when this script is run with source().
print(plot)
dev.off()
